directory <- "/Users/zhangbohan/Documents/R/file_all"
sampleFiles <- grep("lobe",list.files(directory),value=TRUE)
sampleCondition <- sub("(.*lobe).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleFiles,
fileName = sampleFiles,
condition = sampleCondition)
sampleTable$condition <- factor(sampleTable$condition)
library("DESeq2")
dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design= ~ condition)
dds
class: DESeqDataSet
dim: 60483 215
metadata(1): version
assays(1): counts
rownames(60483): ENSG00000000003.13 ENSG00000000005.5 ... ENSGR0000280767.1 ENSGR0000281849.1
rowData names(0):
colnames(215): lowerlobe-TCGA-18-3421-01A-01R-0980-07.count lowerlobe-TCGA-18-4721-01A-01R-1443-07.count ...
upperlobe-TCGA-NJ-A4YQ-01A-11R-A262-07.count upperlobe-TCGA-NJ-A55R-01A-11R-A262-07.count
colData names(1): condition
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds$condition <- factor(dds$condition, levels = c("upperlobe","lowerlobe"))
dds$condition <- droplevels(dds$condition)
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 7234 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
dds
class: DESeqDataSet
dim: 50475 215
metadata(1): version
assays(6): counts mu ... replaceCounts replaceCooks
rownames(50475): ENSG00000000003.13 ENSG00000000005.5 ... ENSG00000281918.1 ENSG00000281920.1
rowData names(23): baseMean baseVar ... maxCooks replace
colnames(215): lowerlobe-TCGA-18-3421-01A-01R-0980-07.count lowerlobe-TCGA-18-4721-01A-01R-1443-07.count ...
upperlobe-TCGA-NJ-A4YQ-01A-11R-A262-07.count upperlobe-TCGA-NJ-A55R-01A-11R-A262-07.count
colData names(3): condition sizeFactor replaceable
library("IHW")
res <- results(dds, filterFun=ihw, contrast=c("condition","upperlobe","lowerlobe"), alpha=0.05)
res
log2 fold change (MLE): condition upperlobe vs lowerlobe
Wald test p-value: condition upperlobe vs lowerlobe
DataFrame with 50475 rows and 7 columns
baseMean log2FoldChange lfcSE stat pvalue padj weight
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000000003.13 3530.58158 -0.14349078 0.1152011 -1.2455675 0.2129232 1.000000 0.470066
ENSG00000000005.5 1.83533 0.92124505 0.3983635 2.3125742 0.0207461 0.304791 1.634073
ENSG00000000419.11 1988.56928 -0.02316500 0.0859819 -0.2694171 0.7876088 1.000000 0.470066
ENSG00000000457.12 1042.05870 0.00836846 0.0679910 0.1230819 0.9020422 1.000000 0.424619
ENSG00000000460.15 610.06415 -0.00423868 0.1112523 -0.0380997 0.9696082 1.000000 0.582609
... ... ... ... ... ... ... ...
ENSG00000281909.1 0.557662 0.323687 0.365305 0.886074 0.3755778 1.000000 0.00000
ENSG00000281910.1 0.291194 -0.186074 0.524813 -0.354553 0.7229245 1.000000 0.00000
ENSG00000281912.1 60.828380 0.226105 0.145742 1.551401 0.1208055 0.446624 3.94932
ENSG00000281918.1 2.456463 -0.573625 0.230559 -2.487977 0.0128472 0.239031 1.74462
ENSG00000281920.1 6.473478 0.222705 0.220144 1.011633 0.3117135 0.742966 2.29753
enemble_id <- substr(row.names(dds),1,15)
rownames(res) <- enemble_id
RawCounts <- res
Ensembl_ID <- data.frame(Ensembl_ID = row.names(RawCounts))
rownames(Ensembl_ID) <- Ensembl_ID[,1]
RawCounts <- cbind(Ensembl_ID,RawCounts)
get_map = function(input) {
if (is.character(input)) {
if(!file.exists(input)) stop("Bad input file.")
message("Treat input as file")
input = data.table::fread(input, header = FALSE)
} else {
data.table::setDT(input)
}
input = input[input[[3]] == "gene", ]
pattern_id = ".*gene_id \"([^;]+)\";.*"
pattern_name = ".*gene_name \"([^;]+)\";.*"
gene_id = sub(pattern_id, "\\1",input[[9]])
gene_name = sub(pattern_name, "\\1", input[[9]])
Ensembl_ID_TO_Genename <- data.frame(gene_id = gene_id, gene_name = gene_name, stringsAsFactors = FALSE)
return(Ensembl_ID_TO_Genename)
}
Ensembl_ID_TO_Genename <- get_map("gencode.v38.basic.annotation.gtf")
Treat input as file
|--------------------------------------------------|
|==================================================|
gtf_Ensembl_ID <- substr(Ensembl_ID_TO_Genename[,1],1,15)
Ensembl_ID_TO_Genename <- data.frame(gtf_Ensembl_ID, Ensembl_ID_TO_Genename[,2])
colnames(Ensembl_ID_TO_Genename) <- c("Ensembl_ID","gene_id")
write.csv(Ensembl_ID_TO_Genename,file = "Ensembl_ID_TO_Genename.csv")
#replace
res_g <-merge(Ensembl_ID_TO_Genename,RawCounts,by="Ensembl_ID")
res_g <- res_g[order(res_g[,"gene_id"]),]
index <- duplicated(res_g$gene_id)
res_g <- res_g[!index,]
rownames(res_g) <- res_g[,"gene_id"]
res_g <- res_g[,-c(1:2)]
head(res_g)
resultsNames(dds)
[1] "Intercept" "condition_lowerlobe_vs_upperlobe"
resLFC <- lfcShrink(dds,coef="condition_lowerlobe_vs_upperlobe", type="apeglm")
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC
log2 fold change (MAP): condition lowerlobe vs upperlobe
Wald test p-value: condition lowerlobe vs upperlobe
DataFrame with 50475 rows and 5 columns
baseMean log2FoldChange lfcSE pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000000003.13 3530.58158 1.13928e-05 0.00144260 0.2129232 0.733175
ENSG00000000005.5 1.83533 -4.76141e-07 0.00144268 0.0207461 0.325044
ENSG00000000419.11 1988.56928 -2.27604e-04 0.00145166 0.7876088 0.963137
ENSG00000000457.12 1042.05870 -3.55648e-06 0.00144237 0.9020422 0.983322
ENSG00000000460.15 610.06415 -7.31588e-06 0.00144258 0.9696082 0.996177
... ... ... ... ... ...
ENSG00000281909.1 0.557662 -3.19932e-06 0.00144269 0.3755778 NA
ENSG00000281910.1 0.291194 3.22278e-07 0.00144269 0.7229245 NA
ENSG00000281912.1 60.828380 -9.78217e-06 0.00144264 0.1208055 0.633484
ENSG00000281918.1 2.456463 8.99681e-06 0.00144268 0.0128472 0.269497
ENSG00000281920.1 6.473478 -4.69361e-06 0.00144267 0.3117135 0.800859
library("BiocParallel")
register(MulticoreParam(4))
resOrdered <- res_g[order(res_g$pvalue),]
sum(res_g$padj < 0.05, na.rm=TRUE)
[1] 433
plotMA(res, ylim=c(-2,2))
plotMA(resLFC, ylim=c(-2,2))
resultsNames(dds)
[1] "Intercept" "condition_lowerlobe_vs_upperlobe"
resNorm <- lfcShrink(dds, coef=2, type="normal")
using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=2, type="ashr")
using 'ashr' for LFC shrinkage. If used in published research, please cite:
Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
plotCounts(dds, gene=which.min(res$padj), intgroup="condition")
d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition",
returnData=TRUE)
library("ggplot2")
Attaching package: ‘ggplot2’
The following object is masked from ‘package:IHW’:
alpha
ggplot(d, aes(x=condition, y=count)) +
geom_point(position=position_jitter(w=0.1,h=0)) +
scale_y_log10(breaks=c(25,100,400))
mcols(res)$description
[1] "mean of normalized counts for all samples"
[2] "log2 fold change (MLE): condition upperlobe vs lowerlobe"
[3] "standard error: condition upperlobe vs lowerlobe"
[4] "Wald statistic: condition upperlobe vs lowerlobe"
[5] "Wald test p-value: condition upperlobe vs lowerlobe"
[6] "Weighted BH adjusted p-values"
[7] "IHW weights"
getwd()
[1] "/Users/zhangbohan/Documents/R/script"
write.csv(as.data.frame(res_g),
file="res_g.csv")
resSig_0.05 <- subset(resOrdered, padj < 0.05)
resSig_0.05[which(resSig_0.05$log2FoldChange > 0), "up_down"] <- "up"
resSig_0.05[which(resSig_0.05$log2FoldChange < 0), "up_down"] <- "down"
resSig_0.05
getwd()
[1] "/Users/zhangbohan/Documents/R/script"
write.csv(as.data.frame(resSig_0.05),
file="differential_expression.csv")
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
# this gives log2(n + 1)
ntd <- normTransform(dds)
library("vsn")
meanSdPlot(assay(ntd))
meanSdPlot(assay(vsd))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors, show_rownames = FALSE)
library("pheatmap")
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("condition", "sizeFactor")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df, show_colnames = FALSE)
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df, show_colnames = FALSE)
pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors, show_rownames = FALSE)
plotPCA(vsd, intgroup="condition")
pcaData <- plotPCA(vsd, intgroup="condition", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()